{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "sys.path.append(os.path.abspath('../../code'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import regex\n",
    "import json\n",
    "from collections import Counter\n",
    "\n",
    "import numpy as np\n",
    "np.set_printoptions(formatter={'float': lambda x: \"{0:0.3f}\".format(x)})\n",
    "import pandas as pd\n",
    "pd.set_option('display.max_columns', 15)\n",
    "pd.set_option('display.width', 320)\n",
    "\n",
    "from utils.io import read_label_config\n",
    "from utils.corpus import DoccanoAnnotationsCorpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from types import SimpleNamespace\n",
    "\n",
    "# All settings in one place (stands in for command-line arguments).\n",
    "args = SimpleNamespace(\n",
    "    # location and naming of the annotation exports to read\n",
    "    data_path='../../data/annotation/annotations/',\n",
    "    data_folder_pattern='uk-commons',\n",
    "    data_annotations_folder='annotations',\n",
    "    data_file_format='jsonl',\n",
    "    # comma-separated names of the annotators whose files are read\n",
    "    keep_annotator='emarie,sjasmin',\n",
    "    # label configuration files\n",
    "    label_config_file='../../data/annotation/doccano_label_config.json',\n",
    "    label_review_config_file='../../data/annotation/doccano_labels_review_config.json',\n",
    "    # destination of the merged corpus; an existing file is only replaced if overwrite_output is True\n",
    "    output_file='../../data/annotation/parsed/uk-commons_annotations.jsonl',\n",
    "    overwrite_output=False,\n",
    "    verbose=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# annotation folder of every entry in the data path whose name starts with the configured pattern\n",
    "subdirs = [\n",
    "    os.path.join(args.data_path, entry, args.data_annotations_folder)\n",
    "    for entry in os.listdir(args.data_path)\n",
    "    if entry.startswith(args.data_folder_pattern)\n",
    "]\n",
    "# the annotator names arrive as one comma-separated string\n",
    "annotators = [name.strip() for name in args.keep_annotator.split(',')]\n",
    "# one file path per (annotator, folder) combination, in sorted order\n",
    "fps = sorted(\n",
    "    os.path.join(folder, name + '.' + args.data_file_format)\n",
    "    for name in annotators\n",
    "    for folder in subdirs\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read the label config; the result is passed to the annotator corpora created below\n",
    "# (presumably a mapping from label category to integer code, judging by the name -- TODO confirm)\n",
    "cat2code = read_label_config(args.label_config_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# read first (we merge the rest to this one)\n",
    "if not fps:\n",
    "    # fail with a readable message rather than an IndexError on fps[0]\n",
    "    raise FileNotFoundError(f'no annotation files selected below {args.data_path!r}')\n",
    "acorp = DoccanoAnnotationsCorpus(cat2code)\n",
    "# annotator ID = file name without its extension, whatever args.data_file_format is set to\n",
    "annotator = os.path.splitext(os.path.basename(fps[0]))[0]\n",
    "acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Reading annotations from file '../../data/annotation/annotations/uk-commons/annotations/sjasmin.jsonl'\n"
     ]
    }
   ],
   "source": [
    "# merge remaining ones: each file is loaded into its own corpus and merged into `acorp`\n",
    "for fp in fps[1:]:\n",
    "    if args.verbose:\n",
    "        print(f'Reading annotations from file \\'{fp}\\'')\n",
    "    annotator_corp = DoccanoAnnotationsCorpus(cat2code)\n",
    "    # annotator ID = file name without its extension, whatever args.data_file_format is set to\n",
    "    annotator = os.path.splitext(os.path.basename(fp))[0]\n",
    "    annotator_corp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)\n",
    "    acorp.merge_annotated_corpus(annotator_corp)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### merge gold labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path of the review project's export that is loaded as the gold annotations below\n",
    "# NOTE(review): folder and file name are hard-coded here rather than taken from `args`\n",
    "fp = os.path.join(args.data_path, 'ra-annotation-uk-commons-review', 'annotations', 'all.jsonl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the label-to-code mapping for the gold annotations by hand\n",
    "base_cats = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']\n",
    "# prefixed variants: all 'I-' labels first, then all 'B-' labels\n",
    "cats = [prefix + cat for prefix in ['I-', 'B-'] for cat in base_cats]\n",
    "# the '-a' and '-z' variants of a label share one code (its position in `cats`, counted from 1)\n",
    "cat2code_gold = {\n",
    "    label + suffix: code\n",
    "    for suffix in ['-a', '-z']\n",
    "    for code, label in enumerate(cats, start=1)\n",
    "}\n",
    "# the outside label gets the same code as in the annotations corpus\n",
    "cat2code_gold['O'] = acorp.outside_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the gold annotations into their own corpus under the annotator ID 'GOLD'\n",
    "# NOTE(review): n_types=6 presumably matches the six base categories used for cat2code_gold -- confirm\n",
    "gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=6)\n",
    "gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# check that doc IDs in gold data match those in the annotations;\n",
    "# fail loudly so that a mismatch cannot go unnoticed before the gold labels are merged\n",
    "annotated_ids = set(acorp.doc_ids)\n",
    "missing_gold_ids = [doc_id for doc_id in gold_corp.doc_ids if doc_id not in annotated_ids]\n",
    "assert not missing_gold_ids, f'{len(missing_gold_ids)} gold doc ID(s) missing from the annotations, e.g. {missing_gold_ids[:5]}'\n",
    "# keep displaying the check result as the cell output\n",
    "not missing_gold_ids"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge the gold labels to the annotations corpus\n",
    "# (the preceding check confirms that every gold doc ID exists in `acorp`)\n",
    "acorp.merge_gold_corpus(gold_corp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[1m00b92b011f1a6c13fcaa47cbf1a3915e\u001b[0m\n",
      "'I know he will share the House ' s concern about the number of young people coming forward who have been victims of horrendous abuse .'\n",
      "                      \u001b[44m   \u001b[49m \u001b[44m     \u001b[49m                                 \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m \u001b[44m      \u001b[49m \u001b[44m       \u001b[49m \u001b[44m   \u001b[49m \u001b[44m    \u001b[49m \u001b[44m    \u001b[49m \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m          \u001b[49m \u001b[44m     \u001b[49m  \t(emarie)\n",
      "                                                                \u001b[44m     \u001b[49m \u001b[44m      \u001b[49m                              \u001b[44m       \u001b[49m \u001b[44m  \u001b[49m \u001b[44m          \u001b[49m \u001b[44m     \u001b[49m  \t(sjasmin)\n",
      "                      \u001b[42m   \u001b[49m \u001b[42m     \u001b[49m                                 \u001b[42m     \u001b[49m \u001b[42m      \u001b[49m \u001b[42m      \u001b[49m \u001b[42m       \u001b[49m \u001b[42m   \u001b[49m \u001b[42m    \u001b[49m \u001b[42m    \u001b[49m \u001b[42m       \u001b[49m \u001b[42m  \u001b[49m \u001b[42m          \u001b[49m \u001b[42m     \u001b[49m  \t[GOLD]\n"
     ]
    }
   ],
   "source": [
    "# show how annotator and gold labels line up for one gold-annotated document\n",
    "example_id = gold_corp.doc_ids[1]\n",
    "example_idx = acorp.doc_id2idx[example_id]\n",
    "print(acorp.docs[example_idx])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No. docs: 1574\n",
      "(array([1, 2]), array([1050,  524]))\n"
     ]
    }
   ],
   "source": [
    "print('No. docs:', acorp.ndocs)\n",
    "# how many singly/multiply annotated? -> (distinct annotation counts, number of docs per count)\n",
    "annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])\n",
    "print(np.unique(annotations_per_doc, return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# count how often each text occurs, to identify duplicate texts (if any)\n",
    "texts = Counter(doc.text for doc in acorp.docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1, 2]), array([1572,    1]))\n"
     ]
    }
   ],
   "source": [
    "# distribution of occurrence counts: (distinct counts, number of texts per count)\n",
    "occurrences = np.asarray(list(texts.values()))\n",
    "print(np.unique(occurrences, return_counts=True))\n",
    "# 1 sentence is a verbatim duplicate (possible because we sampled based on within-manifesto sentence IDs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect the texts that occur more than once (most frequent first)\n",
    "duplicated = [text for text, count in texts.most_common() if count > 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# map each duplicated text to the IDs of the docs that contain it (in corpus order)\n",
    "duplicates_ids = {}\n",
    "for doc in acorp.docs:\n",
    "    if doc.text not in duplicated:\n",
    "        continue\n",
    "    duplicates_ids.setdefault(doc.text, []).append(doc.id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "----------------------------------------------------------------------------------------------------\n",
      "\u001b[1ma530fc5e897883be9be95e6ff301842e\u001b[0m\n",
      "'What recent assessment she has made of the effect of the UK leaving the EU on the progress of talks on restoring devolution in Northern Ireland .'\n",
      "                                                                     \u001b[44m   \u001b[49m \u001b[44m  \u001b[49m                                                                       \t(sjasmin)\n",
      "\u001b[1m07bfd6c84afd3bb57f6131029ab6db73\u001b[0m\n",
      "'What recent assessment she has made of the effect of the UK leaving the EU on the progress of talks on restoring devolution in Northern Ireland .'\n",
      "                                                                     \u001b[44m   \u001b[49m \u001b[44m  \u001b[49m                                                                       \t(sjasmin)\n"
     ]
    }
   ],
   "source": [
    "# print each group of duplicated docs under its own separator line\n",
    "if args.verbose:\n",
    "    for group in duplicates_ids.values():\n",
    "        print('\\n', '-'*100, sep='')\n",
    "        for doc_id in group:\n",
    "            print(acorp.docs[acorp.doc_id2idx[doc_id]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note: I manually disambiguate (no problem because of identical annotations)\n",
    "# key: ID of the doc to keep; value: IDs of all docs that share its text\n",
    "# (the second ID is the other doc printed for this text above, not a repeat of the key)\n",
    "disambigute_duplicates = {\n",
    "    'a530fc5e897883be9be95e6ff301842e': ['a530fc5e897883be9be95e6ff301842e', '07bfd6c84afd3bb57f6131029ab6db73']\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# resolve duplicates: for duplicated texts\n",
    "for ids in duplicates_ids.values():\n",
    "    # Manual override: if one entry of `disambigute_duplicates` covers this whole group, keep the\n",
    "    # doc named by its key and drop the others. The values are lists of IDs, so membership has to\n",
    "    # be tested inside each list (testing an ID against `.values()` itself is never true).\n",
    "    keep = [\n",
    "        kept_id for kept_id, group in disambigute_duplicates.items()\n",
    "        if kept_id in ids and all(doc_id in group for doc_id in ids)\n",
    "    ]\n",
    "    if keep:\n",
    "        acorp.remove_documents([doc_id for doc_id in ids if doc_id != keep[0]])\n",
    "        # the removed docs must not be looked up again by the generic resolution below\n",
    "        continue\n",
    "    # Generic resolution: the first doc is the 'original', all others are folded into it.\n",
    "    for doc_id in ids[1:]:\n",
    "        duplicate = acorp.docs[acorp.doc_id2idx[doc_id]]\n",
    "        original = acorp.docs[acorp.doc_id2idx[ids[0]]]\n",
    "        # iterate over a snapshot because annotations are removed from `duplicate` inside the loop\n",
    "        for annotator in list(duplicate.annotators):\n",
    "            # whether the annotator already in the 'original'\n",
    "            if annotator in original.annotators:\n",
    "                # if so remove annotation\n",
    "                duplicate.remove_annotation(annotator)\n",
    "        if duplicate.n_annotations > 0:\n",
    "            # annotations from annotators not yet in the original are merged into it\n",
    "            acorp.merge_annotations([ids[0], doc_id])\n",
    "        else:\n",
    "            # nothing left to contribute: drop the duplicate\n",
    "            acorp.remove_documents([doc_id])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(array([1]), array([1573]))\n"
     ]
    }
   ],
   "source": [
    "# verify: recount the texts and print the distribution of occurrence counts\n",
    "texts = Counter(doc.text for doc in acorp.docs)\n",
    "print(np.unique(np.asarray([count for count in texts.values()]), return_counts=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# reset important corpus attributes: rebuild both ID<->index lookups from the current doc order\n",
    "current_ids = [doc.id for doc in acorp.docs]\n",
    "acorp.doc_id2idx = {doc_id: idx for idx, doc_id in enumerate(current_ids)}\n",
    "acorp.doc_idx2id = dict(enumerate(current_ids))\n",
    "# ... and recompute the per-annotator label counts\n",
    "acorp.annotator_label_counts = acorp._count_annotator_labels()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## clean tokens"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# collect the distinct tokens and count every character that occurs in the corpus tokens\n",
    "toks = set()\n",
    "all_chars = Counter()\n",
    "for doc in acorp.docs:\n",
    "    for token in doc.tokens:\n",
    "        toks.add(token)\n",
    "        # counts each character of the token once per occurrence\n",
    "        all_chars.update(list(token))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pd\tDash Punctuation\t['-']\n",
      "Pe\tClose Punctuation\t[')']\n",
      "Pf\tFinal Punctuation\t['”']\n",
      "Pi\tInitial Punctuation\t['“']\n",
      "Po\tOther Punctuation\t['.', ';', ',', '?', \"'\", '&', '%', ':', '!', '/']\n",
      "Ps\tOpen Punctuation\t['(']\n",
      "Sc\tCurrency Symbol\t['£']\n"
     ]
    }
   ],
   "source": [
    "from utils.unicode import CATEGORIES as all_char_cats\n",
    "\n",
    "# Work on a filtered copy: deleting keys from the imported dict alters it for every other user\n",
    "# of the module and makes this cell fail with a KeyError when it is run a second time.\n",
    "ignored_cats = {'Ll', 'Lu', 'Nd'}\n",
    "char_cats = {k: v for k, v in all_char_cats.items() if k not in ignored_cats}\n",
    "\n",
    "# for each remaining category, list the corpus characters that match it\n",
    "for k, v in char_cats.items():\n",
    "    regx = r'\\p{'+k+'}'\n",
    "    m = [c for c in all_chars.keys() if regex.match(regx, c)]\n",
    "    if len(m) > 0:\n",
    "        # category code, category name, matching characters (tab-separated)\n",
    "        print(k, v, m, sep='\\t')\n",
    "# NOTE: no need to clean"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Write to disk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create the output folder if the path has a directory part (os.makedirs('') would raise)\n",
    "output_dir = os.path.dirname(args.output_file)\n",
    "if output_dir:\n",
    "    os.makedirs(output_dir, exist_ok=True)\n",
    "\n",
    "# never replace an existing file unless explicitly requested\n",
    "if not os.path.exists(args.output_file) or args.overwrite_output:\n",
    "    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')\n",
    "else:\n",
    "    # make the skipped write visible instead of finishing silently\n",
    "    print(f\"Output file '{args.output_file}' already exists; set args.overwrite_output = True to replace it.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "group_mention_detection",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
